1 System Info

# session info
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.6.0  magrittr_1.5    tools_3.6.0     htmltools_0.3.6
##  [5] yaml_2.2.0      Rcpp_1.0.1      stringi_1.4.3   rmarkdown_1.12 
##  [9] knitr_1.22      stringr_1.4.0   xfun_0.6        digest_0.6.18  
## [13] evaluate_0.13

2 Load Libraries

library(tibble)
library(magrittr)
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
## 
##     extract
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(DT)

3 Load Data

# Read in all the data
# Rel Abund = (Sum of Contig count)/(QC sequence abundace)
MAGnum_df <- read.delim("input_matrix_v2.txt") %>%
    tibble::column_to_rownames(var = "X")

MAGnum_mat <- MAGnum_df %>%
    data.matrix()

MAGnum_count <- read.table("input_matrix_v1.counts.txt", sep="\t", header=TRUE)
rownames(MAGnum_count) <- MAGnum_count[,1]
MAGnum_count<-MAGnum_count[,-1]
head(MAGnum_count)
# read in the Bin stats
bin_stats <- read.table("bin_stats.txt", sep="\t", header=TRUE);
rownames(bin_stats)<-bin_stats[,1]
bin_stats<- bin_stats[,-1]

# Select the Top X abundance bins
abundant_mags <- MAGnum_mat[rowMeans(MAGnum_mat) > 0.0001, ]

# read in the Sample metadata 
# Add a new column with color information for plotting
metadat <- 
  read.table("Sample_metadata.txt", sep = "\t", header = TRUE) %>%
  mutate(colors = ifelse(Location == "Alaska", "cornflowerblue",
                         ifelse(Location == "Bodega Bay", "red",
                                ifelse(Location == "Japan North", "darkorchid",
                                       ifelse(Location == "Japan South", "darkorchid1",
                                              ifelse(Location == "Massachusetts", "deeppink",
                                                     ifelse(Location == "North Carolina", "darkslategray3",
                                                            ifelse(Location == "Quebec", "darkorange",
                                                                   ifelse(Location == "San Diego", "green4",
                                                                          ifelse(Location == "Washington", "ivory4",
                                                                                 ifelse(Location == "Norway", "firebrick",
                                                                                        ifelse(Location == "Sweden", "dodgerblue4",
                                                                                               ifelse(Location == "Wales", "gold2",
                                                                                                      ifelse(Location == "Portugal", "forestgreen",
                                                                                                             ifelse(Location == "French Med", "deepskyblue",
                                                                                                                    ifelse(Location == "Croatia", "black",
                                                                                                                                  NA))))))))))))))))

# Set the colors of the different columns going in
colz <- metadat$colors

location_colors <- c(
  Alaska = "cornflowerblue",
  "Bodega Bay" = "red",
  "Japan North" = "darkorchid",
  "Japan South" = "darkorchid1",
  Massachusetts = "deeppink",
  "North Carolina" = "darkslategray3",
  Quebec = "darkorange",
  "San Diego" =  "green4",
  Washington = "ivory4",
  Norway = "firebrick",
  Sweden = "dodgerblue4",
  Wales = "gold2",
  Portugal = "forestgreen",
  "French Med" = "deepskyblue",
  Croatia = "black")

# Set the color palette
my_palette <- colorRampPalette(c("red","white ", "blue"))(n = 299)

4 Normalization

# Normalizing the matrix 
# Dividing rows by the genome size for the bin
# Genome size is different from Bin size
# Genome size is BinSize * BinCompletion

# Calculate the genome size
genome_sizes<- bin_stats$size * bin_stats$SCG_completeness

# Do the normalization
MAGnum_norm<-t(t(MAGnum_count) / genome_sizes)

5 Plot

5.1 Raw relative abundance data

# Without any scaling 
heatmap.2(MAGnum_mat, 
          main = "NOT scaled Relative Abundance Data",
          distfun = function(x) dist(x, method = "euclidean"),
          hclustfun = function(x) hclust(x,method = "complete"),
          #scale = "row",
          col = my_palette,
          trace = "none",
          ColSideColors = colz,
          key = TRUE, symkey = FALSE, 
          density.info = "none", srtCol = 30,
          margins=c(5.5,7), cexRow = 1.25, cexCol = 1.25,
          key.xlab = "MAG Abundance",
          lhei = c(1.5,9), 
          key.title=NA)

# Add legend
legend(y=0.75, x=0.05, 
       xpd=TRUE,     
       legend = unique(metadat$Location),
       col = unique(metadat$colors), 
       lty= 1, lwd = 5, cex=.7)

5.2 Scaled relative abundance

heatmap.2(MAGnum_mat, 
          main="Relative abundance scaled",
        distfun = function(x) dist(x, method = "euclidean"),
        hclustfun = function(x) hclust(x,method = "complete"),
        scale = "row",
        col = my_palette, 
        trace = "none",
        key = TRUE, symkey = FALSE, 
        density.info = "none", srtCol = 30,
        margins=c(5.5,7), cexRow = 1.25,cexCol = 1.25,
        key.xlab = "MAG Abundance",
        lhei = c(1.5,9), 
        key.title=NA,
        ColSideColors = colz)

# Add legend
legend(y=0.75, x=0.05, 
       xpd=TRUE,     
       legend = unique(metadat$Location),
       col = unique(metadat$colors), 
       lty= 1, lwd = 5, cex=.7)

5.3 Scaled normalized data

5.3.1 Counts per Bin Per sample divided by genome size

# WITH scaling 
heatmap.2(MAGnum_norm, 
          main = "Scaled Normalized Data",
          distfun = function(x) dist(x, method = "euclidean"),
          hclustfun = function(x) hclust(x,method = "complete"),
          scale = "row",
          col = my_palette,
          trace = "none",
          key = TRUE, symkey = FALSE, 
          density.info = "none", srtCol = 30,
          margins=c(5.5,7), cexRow = 1.25, cexCol = 1.25,
          key.xlab = "MAG Abundance",
          lhei = c(1.5,9), 
          key.title=NA,
          ColSideColors = colz)

# Add legend
legend(y=0.75, x=0.05, 
       xpd=TRUE,     
       legend = unique(metadat$Location),
       col = unique(metadat$colors), 
       lty= 1, lwd = 5, cex=.7)

5.4 Scaled Meanm Abundance MAGs > 0.0001

# WITH scaling 
heatmap.2(abundant_mags, 
          main = "Scaled Mean Top Abundant MAGs",
          distfun = function(x) dist(x, method = "euclidean"),
          hclustfun = function(x) hclust(x,method = "complete"),
          scale = "row",
          col = my_palette,
          trace = "none",
          key = TRUE, symkey = FALSE, 
          density.info = "none", srtCol = 30,
          margins=c(5.5,7), cexRow = 1.25, cexCol = 1.25,
          key.xlab = "MAG Abundance",
          lhei = c(1.5,9), 
          key.title=NA,
          ColSideColors = colz)

legend(y=0.75, x=0.05, 
       xpd=TRUE,     
       legend = unique(metadat$Location),
       col = unique(metadat$colors), 
       lty= 1, lwd = 5, cex=.7)

6 Top MAG Abundance Plot

# Select the Top X abundance bins
top_mags <- 
  MAGnum_mat[rowMeans(MAGnum_mat) > 0.001, ]

# How many MAGs?
num_MAGs <- nrow(top_mags)

# Prepare dataframe for plotting
abund_df <- top_mags %>%
  t() %>%
  as.data.frame() %>%
  tibble::rownames_to_column(var = "MAG") %>%
  rename(Sample = MAG) %>%
  gather("MAG", "norm_abund", 2:(num_MAGs+1)) %>%
  left_join(metadat, by = "Sample") %>%
  dplyr::select(-colors) 
## Warning: Column `Sample` joining character vector and factor, coercing into
## character vector
# What's the data frame look like?
datatable(abund_df, options = list(pageLength = 10))
# Plot each MAG by location
MAG_abund_plot <- 
  ggplot(abund_df, 
       aes(x = Location, y = norm_abund, color = Location, fill = Location)) + 
  geom_jitter() + 
  #geom_boxplot(alpha = 0.4, outlier.shape = NA) + 
  theme_minimal() + 
  scale_color_manual(values = location_colors) +
  scale_fill_manual(values = location_colors) +
  facet_wrap(~MAG, scales = "free") +
  labs(y = "Normalized MAG Abundance") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1))

# Interactive Plot 
ggplotly(MAG_abund_plot) # Make the plot interactive